This is a guided analysis to show how to interpret (CD8 T cell) single-cell RNA-seq data using Seurat and other R packages. We use a public dataset of small size (~1K cells) in order to make fast calculations GEO GSE85947
Load packages and source files (custom functions)
#install.packages("Seurat")
library(Seurat)
source("functions.R")
Read expression matrix (\(log2 (TPM+1 )\))
if(!file.exists("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")){
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE86028&format=file&file=GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz",destfile = "input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")
}
m <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz"),row.names=1,sep=" ",header=T)
m[1:10,1:5]
Read meta data
meta <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.index.txt.gz"),sep="\t",as.is=F,row.names=1)
meta$mouse_id = factor(meta$mouse_id)
meta$plate_uniq_num = factor(meta$plate_uniq_num)
meta$batch_bio = factor(meta$batch_bio)
head(meta)
str(meta)
'data.frame': 1061 obs. of 6 variables:
$ cell_type : Factor w/ 1 level "CD8": 1 1 1 1 1 1 1 1 1 1 ...
$ condition_plate: Factor w/ 2 levels "MTKO","WT": 1 1 1 1 2 2 1 1 2 2 ...
$ seq_id : Factor w/ 4 levels "AH3VMFBGXY","H3332BGXY",..: 1 1 1 1 1 4 1 1 1 1 ...
$ batch_bio : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
$ mouse_id : Factor w/ 9 levels "1","2","3","4",..: 7 7 7 7 3 3 8 7 3 3 ...
$ plate_uniq_num : Factor w/ 16 levels "1","2","3","4",..: 1 1 1 1 2 3 4 1 2 2 ...
Create Seurat object
data.seurat <- CreateSeuratObject(m, meta.data = meta, project = "tumorTILS", min.cells = 3, min.genes = 500, is.expr = 0,do.scale=F, do.center=F)
rm(m,meta)
Explore samples
table(data.seurat@meta.data$mouse_id)
1 2 3 4 5 6 7 8 9
130 158 108 138 78 80 133 106 130
summary(data.seurat@meta.data$nGene)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1532 2239 2891 3079 3787 5960
summary(data.seurat@meta.data$nUMI)
Min. 1st Qu. Median Mean 3rd Qu. Max.
8959 13733 17073 17835 21256 32145
Caclulate ribosomal content (might indicate techincal variability) (same for mitochondrial content)
ribo.genes <- grep(pattern = "^Rp[ls]", x = rownames(x = data.seurat@data), value = TRUE)
percent.ribo <- Matrix::colSums(data.seurat@raw.data[ribo.genes, ])/Matrix::colSums(data.seurat@raw.data)
summary(percent.ribo)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.02911 0.04271 0.05150 0.05308 0.06220 0.09850
Add new ‘metadata’ to our Seurat object
data.seurat <- AddMetaData(object = data.seurat, metadata = percent.ribo, col.name = "percent.ribo")
print(paste("matrix dim",dim(data.seurat@data)))
[1] "matrix dim 14314" "matrix dim 1061"
In this dataset ‘high quality’ cells were already selected, however we might want to filter further based on these parameters
hist(data.seurat@meta.data$nGene)
hist(data.seurat@meta.data$nUMI)
hist(data.seurat@meta.data$percent.ribo)
Note that Seurat automatically calculates ‘nUMI’ per cell, assuming @data matrix contains UMI counts. In this case however, ‘nUMI’ contains the sum of the normalized expression values (log2 TPM) for each cell
Look at distributions per sample
data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
VlnPlot(object = data.seurat, features.plot = c("nGene", "nUMI","percent.ribo"), nCol = 2,point.size.use=0.001)
print(paste("matrix dim",dim(data.seurat@data)))
[1] "matrix dim 14314" "matrix dim 1061"
data.seurat <- FilterCells(object = data.seurat, subset.names = c("nGene", "nUMI","percent.ribo"),
low.thresholds = c(1500, 8000, -Inf), high.thresholds = c(6000, 30000,0.1))
print(paste("matrix dim",dim(data.seurat@data)))
[1] "matrix dim 14314" "matrix dim 1056"
There are two groups of mice, Wildtype (WT) and MT KO
table(data.seurat@meta.data$condition_plate)
MTKO WT
527 529
Will analyze only cells from WT mice
data.seurat <- SubsetData(data.seurat,cells.use = data.seurat@meta.data$condition_plate=="WT")
table(data.seurat@meta.data$mouse_id)
1 2 3 4 5 6 7 8 9
130 158 108 133 0 0 0 0 0
Extract highly variable genes
set.seed(1234)
data.seurat <- FindVariableGenes(
object = data.seurat,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 1,
x.high.cutoff = 15,
y.cutoff = 1
)
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
length(data.seurat@var.genes)
[1] 1616
head(row.names(data.seurat@hvg.info)[order(data.seurat@hvg.info$gene.mean,decreasing = T)],n=20)
Check expression values of a few genes
VlnPlot(data.seurat,features.plot = qq(Gapdh,Actg1,Actb,Ptprc,Cd2,Cd3g,Cd8a,Cd8b1), point.size.use = exp(-3), x.lab.rot=T)
Some variation but no evident systematic biases
Looking at expression values
plot(density(as.numeric(data.seurat@data["Gzmb",])))
Scale data (useful for example to do PCA on scaled variables, to make heatmaps, etc)
data.seurat <- ScaleData(
object = data.seurat, do.scale=T, do.center = T
)
NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
Scaling data matrix
|
| | 0%
|
|================================================================================================================================================================| 100%
plot(density(as.numeric(data.seurat@scale.data["Gzmb",])))
PCA on highly variable genes
set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 10)
[1] "PC1"
[1] "Pde2a" "Pydc3" "Fam65b" "Gramd3" "Itgb7" "Sell" "Zfp36l2" "Cd7" "Cxcr3" "Pik3ip1"
[1] ""
[1] "Cks1b" "Tpi1" "Birc5" "Ube2c" "Txn1" "Rrm2" "Ccnb2" "Hist1h2ao" "Cst3" "Cd74"
[1] ""
[1] ""
[1] "PC2"
[1] "Nkg7" "Lag3" "Gzmb" "Prf1" "Ctla4" "Ccl3" "Ccnb2" "Hilpda" "Ifng" "Impa2"
[1] ""
[1] "Ifi205" "Fgd2" "Csf2rb" "Ly86" "Rnase6" "Aif1" "Tbc1d8" "Plbd1" "Cd24a" "Ctsh"
[1] ""
[1] ""
[1] "PC3"
[1] "Ccr7" "Sell" "Xcl1" "Dapl1" "Lad1" "Gpr18" "Rgcc" "Cd69" "Lta" "Fos"
[1] ""
[1] "Gzmb" "Id2" "Ccl4" "Lgals3" "Prf1" "Ccl3" "Nkg7" "AW112010" "Actg1" "Hilpda"
[1] ""
[1] ""
[1] "PC4"
[1] "Rrm2" "Spc24" "Hist2h3c2" "Hist2h3b" "Birc5" "Kif22" "Aurka" "Fam83d" "Pkmyt1" "Hist1h2ao"
[1] ""
[1] "Cd70" "Cd83" "Xcl1" "Ccl1" "Lad1" "Cd81" "Plk2" "Ccr8" "Slc2a6" "Lta"
[1] ""
[1] ""
[1] "PC5"
[1] "Laptm4b" "Ccl22" "Oasl1" "Fscn1" "Ptafr" "Gcnt2" "Ebi3" "Cacnb3" "Mreg" "Isg15"
[1] ""
[1] "Apoe" "C1qa" "Lyz2" "C1qc" "C1qb" "Fcgr4" "Plbd1" "Cd300lf" "Selenbp1" "Emr1"
[1] ""
[1] ""
We explore the genes with highers loading to have an idea of which ones are driving global variation (e.g. important T cell genes as expected, such as Sell, Lag3, Gzmb, Prf1; cell cycle genes such as Ccnb2, Cks1b, etc )
PCElbowPlot(object = data.seurat)
Check distribution per sample (batch effects?)
data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
PCAPlot(object = data.seurat)
Check distribution of important genes for our subset
FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","Cd4"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
$Ptprc
$Cd2
$Cd8a
$Cd8b1
$Cd4
Cell cycle is typically a strong source of transcriptomic variation
FeaturePlot(data.seurat,features.plot=c("Mki67","Ccna2","Cks1b"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
$Mki67
$Ccna2
$Cks1b
Number of detected genes/reads count per cell associated to amount af mRNA: might be associated to biology (eg cell size, transcriptional activity), cell quality, technical effects, etc
FeaturePlot(data.seurat,features.plot=c("nUMI","nGene","percent.ribo"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
$nUMI
$nGene
$percent.ribo
There are some outliers. Not interested (unless looking for rare cell states)
Spot genes among top genes contributing to PCs which might be associated to outliers
FeaturePlot(data.seurat,features.plot=c("H2-Eb1","Ctsh","H2-Aa","H2-Ab1","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
$`H2-Eb1`
$Ctsh
$`H2-Aa`
$`H2-Ab1`
$Csf2rb
Indeed, these outliers express high levels of MHC II genes, Csf2rb, etc. suggesting these might be contaminating myeloid/APCs
Filter out outliers based on expression (or lack of) of a few genes (you might try multivariate filtering criteria as well)
data.seurat <- FilterCells(object = data.seurat, subset.names = c("Cd2", "Cd8a","Cd8b1","Cd4","Ctsh"),
low.thresholds = c(1, 1, 1,-Inf,-Inf), high.thresholds = c(Inf,Inf,Inf,exp(-10),exp(-10)))
print(paste("matrix dim",dim(data.seurat@data)))
[1] "matrix dim 14314" "matrix dim 398"
FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","H2-Aa","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
$Ptprc
$Cd2
$Cd8a
$Cd8b1
$`H2-Aa`
$Csf2rb
Now it looks better
Also there is a strong effect from cell cycle genes, we might want to remove them from our list of highly variable genes used for dimensionality
Get list of cell cycle genes (map to Ensembl IDs)
#BiocManager::install("org.Mm.eg.db",type="source")
#BiocManager::install("clusterProfiler")
#BiocManager::install("DO.db",type="source")
#BiocManager::install("GO.db",type="source")
#signatureList <- readRDS("input/signatureList.rds")
library(org.Mm.eg.db)
library(clusterProfiler)
idMap <- bitr( rownames(data.seurat@data) , fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Mm.eg.db)
cellCycle <- mget(c("GO:0007049"),org.Mm.egGO2ALLEGS)
cellCycle <- unique(unlist(cellCycle))
cellCycle.symbol <- unique(na.omit(idMap$SYMBOL[match(cellCycle,idMap$ENTREZID)]))
length(cellCycle.symbol)
[1] 1293
print(length(data.seurat@var.genes))
[1] 1616
data.seurat@var.genes <- setdiff(data.seurat@var.genes,cellCycle.symbol)
print(length(data.seurat@var.genes))
[1] 1494
PCA
data.seurat <- ScaleData(object = data.seurat, do.scale=T, do.center = T) #since we filtered out cells, scaling might have slightly changed
NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
Scaling data matrix
|
| | 0%
|
|================================================================================================================================================================| 100%
set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 10)
[1] "PC1"
[1] "Pydc3" "Pde2a" "Itgb7" "Fam65b" "Sell" "Cxcr3" "Gramd3" "Cd7" "Pik3ip1" "Igfbp4"
[1] ""
[1] "Tpi1" "Lag3" "Impa2" "Txn1" "Gapdh" "Bcl2a1d" "Rrm2" "Hist1h2ao" "Gzmc" "Mt1"
[1] ""
[1] ""
[1] "PC2"
[1] "Ccr7" "Sell" "Xcl1" "Lad1" "Dapl1" "Cd83" "Slc2a6" "Fos" "Cd69" "Cd81"
[1] ""
[1] "Gzmb" "Ccl4" "Nkg7" "Prf1" "Lgals3" "Actg1" "Ccl3" "AW112010" "AA467197" "Serpina3g"
[1] ""
[1] ""
[1] "PC3"
[1] "Ly6c2" "Rrm2" "Hist2h3c2" "Hist2h3b" "Sell" "Nrm" "Tgtp1" "Pde2a" "Mef2d" "Hist1h2ao"
[1] ""
[1] "Cd83" "Cd81" "Cd70" "Ccr8" "Ccl1" "Xcl1" "Lag3" "Bcl2a1d" "Slc2a6" "Idi2"
[1] ""
[1] ""
[1] "PC4"
[1] "Rrp9" "Utf1" "Apoe" "2410002F23Rik" "Atf3" "Prdx4" "H2-Eb1" "Rrm2" "Rps4y2" "Hsph1"
[1] ""
[1] "Lars2" "Gm16894" "Zc3h12a" "Lgals9" "Mcoln1" "Unc93b1" "Foxo3" "Kdelc2" "Irf7" "Igflr1"
[1] ""
[1] ""
[1] "PC5"
[1] "Gzmd" "Gzmg" "Gzme" "S1pr5" "Gzmf" "Klrg1" "Gzma" "Gzmc" "Dtx1" "Serpinb9b"
[1] ""
[1] "Hist2h3c2" "Hist2h3b" "Ctla4" "Ifi27l2a" "Rtp4" "Rrm2" "Hist1h2ao" "AW112010" "Ifit3" "Cxcl10"
[1] ""
[1] ""
PCElbowPlot(object = data.seurat)
Clustering Good old hierarchical clustering
ndim <- 10
set.seed(1234)
fit.hclust=hclust(dist(data.seurat@dr$pca@cell.embeddings[,1:ndim]),method="ward.D2")
Checking silhouette using different number of clusters
library(cluster)
sils=c()
for (cc in 2:10){
clusters.cc=cutree(fit.hclust,k=cc)
sil.c=summary(silhouette(x=as.numeric(clusters.cc),dist=dist(data.seurat@dr$pca@cell.embeddings[,1:ndim])))
sils=c(sils,sil.c$avg.width)
}
plot(c(2:10),sils)
nclus <- 5
clusters=factor(cutree(fit.hclust,k=nclus))
data.seurat@ident <- clusters
data.seurat <- StashIdent(object = data.seurat, save.name = "wardClustering")
data.seurat <- SetAllIdent(object = data.seurat, id = "wardClustering")
PCAPlot(object = data.seurat)
tSNE
data.seurat <- RunTSNE(object = data.seurat, dims.use = 1:ndim, do.fast = F, seed.use=123, perplexity=30)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "wardClustering")
Alternative clustering implemented in Seruat: shared nearest neighbor (SNN) modularity optimization based clustering algorithm
set.seed(12345)
data.seurat <- FindClusters(object = data.seurat, reduction.type = "pca", dims.use = 1:ndim,
resolution = 0.5, print.output = 0, save.SNN = TRUE, force.recalc = T, k.param = 10)
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "res.0.5")
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "wardClustering",
no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)
data.seurat <- SetAllIdent(object = data.seurat, id = "res.0.5")
data.seurat@meta.data$cluster <- data.seurat@meta.data$wardClustering
Good correspondence ###
###
K-Means
set.seed(1234)
clusters.kmeans <- kmeans(data.seurat@dr$pca@cell.embeddings[,1:ndim],6,nstart = 10)
data.seurat@ident <- factor(clusters.kmeans$cluster)
data.seurat <- StashIdent(object = data.seurat, save.name = "kmeansClustering")
TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "kmeansClustering",
no.legend = TRUE, do.label = TRUE)
#ggsave("out/tSNE.0.pdf",width = 3,height = 3)
Also with KMeans
table(data.seurat@meta.data$wardClustering,data.seurat@meta.data$cluster)
# Rand index for cluster agreement
#install.packages("mclust")
library(mclust)
adjustedRandIndex(data.seurat@meta.data$wardClustering, data.seurat@meta.data$cluster)
adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$cluster)
adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$wardClustering)
#ward clustering has more agreement with the other two
Checking co-expression of important genes
p1 <- plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")
p2 <- plot2geneSeuratTSNE(data.seurat,"Gzmb","Ccr7")
p3 <- plot2geneSeuratTSNE(data.seurat,"Lag3","Sell")
p4 <- plot2geneSeuratTSNE(data.seurat,"Havcr2","Entpd1")
plot_grid(p1,p2,p3,p4)
#ggsave("out/tSNE.markers.pdf",width = 10,height=8)
plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")
plot2geneSeuratTSNE(data.seurat,"Mki67","Ccna2")
Sample distribution (biological variablity +batch effect)
for (s in levels(factor(data.seurat@meta.data$mouse_id))) {
mycol <- data.seurat@meta.data$mouse_id==s
print(ggplot(data=data.frame(data.seurat@dr$tsne@cell.embeddings), aes(x=tSNE_1,y=tSNE_2, color="gray")) +
geom_point(color="gray",alpha=0.5) +
geom_point(data=data.frame(data.seurat@dr$tsne@cell.embeddings[mycol,]),alpha=0.7, color="blue") +
theme_bw() + theme(aspect.ratio = 1, legend.position="right") + xlab("TSNE 1") + ylab("TSNE 2") + ggtitle(s))
}
classification of CD8 TIL states using TILPRED
#install.packages("BiocManager")
#install.packages("doParallel")
#install.packages("doRNG")
#BiocManager::install("GenomeInfoDbData",type="source")
#BiocManager::install("AUCell")
#BiocManager::install("SingleCellExperiment",type="source")
#install.packages("remotes")
#remotes::install_github("carmonalab/TILPRED")
library(SingleCellExperiment)
library(AUCell)
library(TILPRED)
data.sce <- Convert(data.seurat,to="sce")
data.sce.pred <- predictTilState(data.sce)
The following genes were not found Gm10282,Cks1brt,Pcna-ps2 . Unknown prediction performance.Genes in the gene sets NOT available in the dataset:
NaiveVsExhausted_down: 1 (1% of 97)
cycling: 2 (2% of 94)
table(data.sce.pred$predictedState)
Naive Effector MemoryLike Exhausted unknown
46 90 68 184 10
str(data.sce.pred)
List of 3
$ predictedState : Factor w/ 5 levels "Naive","Effector",..: 5 4 4 4 4 3 2 4 4 4 ...
..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
$ stateProbabilityMatrix: num [1:398, 1:4] 1.61e-02 2.57e-03 1.00e-06 1.38e-05 5.46e-05 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
.. ..$ : chr [1:4] "Naive" "Effector" "MemoryLike" "Exhausted"
$ cycling : Named logi [1:398] FALSE FALSE FALSE FALSE FALSE FALSE ...
..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
head(data.sce.pred$stateProbabilityMatrix)
Naive Effector MemoryLike Exhausted
AH3VMFBGXY_A10_S10 1.608488e-02 2.974940e-01 0.359993363 0.3264278
AH3VMFBGXY_A4_S4 2.567036e-03 4.726579e-01 0.019469265 0.5053058
AH3VMFBGXY_A6_S6 1.003015e-06 1.676011e-06 0.001023729 0.9989736
AH3VMFBGXY_A7_S7 1.378348e-05 1.564267e-04 0.004174986 0.9956548
AH3VMFBGXY_A9_S9 5.460452e-05 9.523511e-04 0.009209985 0.9897831
AH3VMFBGXY_B11_S23 2.596244e-04 8.337796e-04 0.817061148 0.1818454
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$predictedState, col.name = "state.pred")
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$cycling, col.name = "cycling")
stateColorsPred <- c("#F8766D","#A58AFF","#00B6EB","#53B400","#000000")
names(stateColorsPred) <- qq(Naive,Effector,MemoryLike,Exhausted,NP)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")
plot_grid(plot1, plot2)
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster")
plot_grid(plot1, plot2)
Explore association of clusters and predicted states
library(pheatmap)
library(RColorBrewer)
a <- table(data.seurat@meta.data$state.pred,data.seurat@meta.data$cluster)
a.scaled <- scale(a,scale = colSums(a),center=F)*100
a.scaled
1 2 3 4 5
Naive 2.727273 0.000000 0.000000 2.325581 84.000000
Effector 69.090909 6.349206 0.000000 4.651163 8.000000
MemoryLike 3.636364 7.936508 23.188406 86.046512 2.000000
Exhausted 20.909091 84.126984 75.362319 6.976744 0.000000
unknown 3.636364 1.587302 1.449275 0.000000 6.000000
pheatmap(round(a.scaled),cexCol = 0.8,cluster_cols = F)
Cycling cells per cluster
barplot(tapply(data.seurat@meta.data$cycling,data.seurat@meta.data$cluster,mean))
clusterColors <- c("#F8766D","#A58AFF","#00B6EB","#53B400","#FF00FF")
names(clusterColors) <- c(5,1,4,2,3)
#BiocManager::install("MAST")
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
cd8.markers <- FindAllMarkers(object = data.seurat, only.pos = TRUE, min.pct = 0.1, min.diff.pct=0.1, logfc.threshold = 0.25, test.use = "bimod",max.cells.per.ident = Inf)#Try MAST
cd8.markers.list <- split(cd8.markers,cd8.markers$cluster)
cd8.markers.list <- lapply((cd8.markers.list),function(x) x$gene)
lapply(cd8.markers.list,head)
$`1`
[1] "Ccl5" "Zfp300" "Arl6ip4" "Atp6v1g1" "Eif3e" "Faim"
$`2`
[1] "Lag3" "Litaf" "Id2" "S100a11" "Anxa2" "Lgals1"
$`3`
[1] "2810417H13Rik" "Hmmr" "Mcm5" "Stmn1" "Srsf6" "Birc5"
$`4`
[1] "Zfp202" "Cd83" "Ccr7" "Ifi27l2a" "Sspo" "Rgcc"
$`5`
[1] "Sell" "Cnr2" "Rapgef4" "Tcf7" "S1pr1" "Lef1"
set.seed(1234)
geneSample <- unique(unlist(lapply(cd8.markers.list,function(x) { head(x,n=10) })))
clusterList <- split(row.names(data.seurat@meta.data),data.seurat@meta.data$cluster)
cellSample <- unlist(lapply(clusterList,function(x) { head(x,n=25) }))
ann_col <- data.frame(cluster = data.seurat@meta.data$cluster)
row.names(ann_col) <- row.names(data.seurat@meta.data)
ann_colors <- list(cluster = clusterColors)
pheatmap(data.seurat@data[geneSample,cellSample], cluster_rows = T, cluster_cols = F, labels_col = "",annotation_col = ann_col[cellSample,,drop=F],scale = "row", clustering_distance_rows="correlation", annotation_colors = ann_colors)
myMarkers <- qq(Sell,Il7r,Lef1,S1pr1,Ccr7,Tcf7,Cd44,Ccl5,Gzmk,Cxcr3,Gzmb,Fasl,Prf1,Tox,Batf,Pdcd1,Lag3,Tigit,Havcr2,Entpd1,Mki67,Ccna2)
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
DotPlot(data.seurat,genes.plot = myMarkers,x.lab.rot = 1,plot.legend =T,cols.use = "RdYlGn")
#ggsave("out/DotPlot.pdf",width = 10, height = 3)
Define a few example gene signatures
signatureList <- list()
signatureList$Cytotoxicity <- c("Prf1","Fasl","Gzmb")
signatureList$Stemness <- c("Tcf7","Sell","Il7r","Lef1")
signatureList$InhibitoryReceptors <- c("Pdcd1","Havcr2","Tigit","Lag3","Ctla4")
signatureList$cellCycleGenes <- cellCycle.symbol
Use AUCell package to calculate signature enrichment
#BiocManager::install("AUCell")
library(AUCell)
set.seed(1234)
cells_rankings <- AUCell::AUCell_buildRankings(as.matrix(data.seurat@data), nCores=2, plotStats=TRUE)
min 1% 5% 10% 50% 100%
1576.00 1595.94 1719.00 1885.80 2901.50 5786.00
cells_AUC <- AUCell_calcAUC(signatureList, cells_rankings, aucMaxRank=1500)
cells_AUC
AUC for 4 gene sets (rows) and 398 cells (columns).
Top-left corner of the AUC matrix:
cells
gene sets AH3VMFBGXY_A10_S10 AH3VMFBGXY_A4_S4 AH3VMFBGXY_A6_S6 AH3VMFBGXY_A7_S7 AH3VMFBGXY_A9_S9
Cytotoxicity 0.44570538 0.9178905 0.7725857 0.7405429 0.6030263
Stemness 0.14173623 0.0000000 0.0000000 0.0000000 0.0000000
InhibitoryReceptors 0.63233133 0.7472278 0.7910488 0.6120240 0.6452906
cellCycleGenes 0.09842882 0.1018561 0.1251876 0.1054429 0.1094186
data.seurat@meta.data <- data.seurat@meta.data[!grepl("^AUC",colnames(data.seurat@meta.data))]
data.seurat <- AddMetaData(data.seurat, metadata = t(getAUC(cells_AUC)), col.name = paste0("AUC_",rownames(cells_AUC)))
for (s in names(signatureList)) {
print(myFeaturePlotAUC(data.seurat,s))
}
for (s in names(signatureList)) {
print(VlnPlot(object = data.seurat, features.plot = paste0("AUC_",s), point.size.use=0.001, cols.use=clusterColors, group.by = "cluster"))
}